---
title: "1-Preparing data for analysis"
author: "Alvaro Passi-Solar"
date: "`r Sys.Date()`"
output: html_document
---

```{r, include=F}
library(tidyverse)
library(metafor)
# NOTE: the former `rm(list = ls())` call was removed. It deleted every object
# in the caller's global environment when the chunk was run interactively, yet
# did not give a clean session (loaded packages and options are untouched).
# Nothing below depends on it: every object used in this file is created here.

# Import the extraction spreadsheet (path is relative to the working directory).
df0 <- rio::import("2024.05.04. Extraccion_ANX_PTSD.xlsx")

# Strongly penalise scientific notation so numbers are printed and written in
# fixed notation.
options(scipen = 999)
```


# 1.0 - Data prep: col names,  col types, HDI and GII as %


```{r, include=F}
# ---- Column names ----
# Strip spaces, square brackets, parentheses and percent signs from the
# imported headers, then rewrite every "95" as "c95" (later code reads the
# columns `c95LI` and `c95LS`).
for (junk in c(" ", "[", "]", "(", ")", "%")) {
  names(df0) <- gsub(junk, "", names(df0), fixed = TRUE)
}
names(df0) <- gsub("95", "c95", names(df0), fixed = TRUE)

# ---- Aliases for existing columns ----
df0$author <- df0$Author
df0$year <- df0$Year

df0$Incluye_OCD <- df0$IncluyeOCD
df0$Incluye_PTSD <- df0$IncluyePTSD
df0$Incluye_SP <- df0$IncluyeSP

# ---- Identifiers ----
df0$ID <- df0$PLOT
# Identifier used in plots: "<ID> (<NewID>-<Estudio>)".
df0$ID_AP <- paste0(df0$ID, " (", df0$NewID, "-", df0$Estudio, ")")

# ---- Country-level covariates ----
# Coerce the covariates to numeric (in place).
numeric_covariates <- c(
  "Gini", "Homicide_Rate", "HDI_n", "HDI_health", "HDI_edu", "HDI_inc", "GII"
)
for (covariate in numeric_covariates) {
  df0[[covariate]] <- as.numeric(df0[[covariate]])
}

# Add a "<name>_100" copy of each index multiplied by 100.
for (index_col in c("HDI_n", "HDI_health", "HDI_edu", "HDI_inc", "GII")) {
  df0[[paste0(index_col, "_100")]] <- df0[[index_col]] * 100
}

# ---- Combined meta-analysis key ----
# Start from MAALL_j; rows with a non-missing MASex_j get "<MASex_j>_<Sex>".
has_sex_key <- !is.na(df0$MASex_j)
df0$MAALL_js <- df0$MAALL_j
df0$MAALL_js[has_sex_key] <- paste0(
  df0$MASex_j[has_sex_key], "_", df0$Sex[has_sex_key]
)

# ---- Reported 95% CI ----
# Use a dot as decimal mark instead of a comma before converting to numbers.
df0$c95LI <- gsub(",", ".", df0$c95LI, fixed = TRUE)
df0$c95LS <- gsub(",", ".", df0$c95LS, fixed = TRUE)

# Bounds divided by 100, plus untouched "_o" copies of the values as reported.
df0$c95LIp <- as.numeric(df0$c95LI) / 100
df0$c95LSp <- as.numeric(df0$c95LS) / 100
df0$c95LIp_o <- df0$c95LIp
df0$c95LSp_o <- df0$c95LSp

# ---- Prevalence, denominators and cases ----
df0$P <- as.numeric(as.character(df0$Prevalence)) / 100
df0$total <- df0$N
df0$cases <- df0$P * df0$total

# ---- Reported standard error ----
# SE divided by 100; SE_o keeps the value as reported, SE is edited later.
se_scaled <- as.numeric(df0$SE) / 100
df0$SE_o <- se_scaled
df0$SE <- se_scaled

# Move the key working columns to the front; the rest keep their order.
df0 <- select(
  df0,
  ID_AP, c95LIp, c95LSp, P, SE, N, total, n,
  everything()
)
```

```{r}
# table(df0$Prevalence)
```

# 1.1 - Continuity correction


```{r, include=F}
# Continuity correction.
# Rows whose estimated number of cases (reported prevalence x sample size) is
# below 0.5 -- which includes a reported prevalence of 0 -- have their cases
# set to 0.5 and their total increased by 0.5; P is then recomputed as
# cases / total. The author reports that this affected 106 of 1,715
# observations.

# Flag column plus "_o" copies of the values before any correction.
df0$comment_continuity <- NA
df0$cases_o <- df0$cases
table(df0$cases_o < 0.5) # 106
df0$total_o <- df0$total
df0$P_o <- df0$P

# Rows needing the correction (rows with unknown cases are never selected).
needs_fix <- !is.na(df0$cases_o) & df0$cases_o < 0.5
fix_total <- needs_fix & !is.na(df0$total)

df0$cases[needs_fix] <- 0.5
df0$total[fix_total] <- df0$total[fix_total] + 0.5
df0$comment_continuity[needs_fix] <- "continuity correction + n=0.5"

# Recompute the prevalence wherever both cases and total are known.
can_recompute <- !is.na(df0$cases) & !is.na(df0$total)
df0$P[can_recompute] <- df0$cases[can_recompute] / df0$total[can_recompute]


```

# 1.2 - Set SE=0 and identical 95% CI as missing 

```{r, include=F}
# A reported SE of exactly 0 (30 observations according to the author) is
# treated as missing; it is recalculated further down in this file.
zero_se_rows <- which(df0$SE == 0)
df0$SE[zero_se_rows] <- NA

# A reported CI whose lower and upper bounds coincide is flagged here and set
# to missing below; it is recalculated further down in this file.
df0$comment_ci_identical <- ifelse(
  df0$c95LIp == df0$c95LSp,
  "CI identical, SE=0",
  NA
)
table(df0$comment_ci_identical)

# Blank both working bounds on those rows; the "_o" copies stay untouched.
degenerate_ci <- !is.na(df0$c95LIp_o) &
  !is.na(df0$c95LSp_o) &
  df0$c95LIp_o == df0$c95LSp_o
df0$c95LIp[degenerate_ci] <- NA
df0$c95LSp[degenerate_ci] <- NA
```

# 1.3 - Calculate 95% CI from P and valid SE


```{r, include=F}
# 95% CI derived from the SE (454 observations according to the author).
z975 <- qnorm(0.975)

# Logit of the prevalence and its SE: SE divided by P * (1 - P).
df0$logitp <- log(df0$P / (1 - df0$P))
df0$logitp_SE <- df0$SE / (df0$P * (1 - df0$P))

# Limits on the logit scale, back-transformed to proportions.
logit_lower <- df0$logitp - df0$logitp_SE * z975
logit_upper <- df0$logitp + df0$logitp_SE * z975
df0$c95LIp_SE <- exp(logit_lower) / (1 + exp(logit_lower))
df0$c95LSp_SE <- exp(logit_upper) / (1 + exp(logit_upper))

# How many missing lower bounds can be filled from the SE-based interval?
table(!is.na(df0$c95LIp_SE[is.na(df0$c95LIp)]), useNA = "always")

# Fill missing bounds only; existing bounds are kept.
lower_missing <- is.na(df0$c95LIp)
upper_missing <- is.na(df0$c95LSp)
df0$c95LIp[lower_missing] <- df0$c95LIp_SE[lower_missing]
df0$c95LSp[upper_missing] <- df0$c95LSp_SE[upper_missing]

# Variance on the proportion scale from the CI width (width / (2 * z))^2,
# then divided through P * (1 - P) to obtain the variance used as vi_CI.
df0$var_CI <- ((df0$c95LSp - df0$c95LIp) / (z975 * 2))^2
df0$vi_CI <- (sqrt(df0$var_CI) / (df0$P * (1 - df0$P)))^2
```


# 1.4 - Calculate missing total from 95% CI

```{r, include=F}
# Studies without a total number of observations (24 according to the author)
# get one derived from the CI-based variance -- the CI being either the one
# reported or the one computed from the reported SE.

# Sample size implied by the variance: P * (1 - P) / var_CI.
df0$n_var <- (df0$P * (1 - df0$P)) / df0$var_CI

total_missing <- is.na(df0$total)

# Flag the rows that actually receive an imputed total.
df0$comment[total_missing & !is.na(df0$n_var)] <-
  "total and cases calculated from 95CI"

df0$total[total_missing] <- df0$n_var[total_missing]

# Cases are recomputed for every row from the current P and total.
df0$cases <- df0$P * df0$total

```





# 1.5 - Calculate vi according to PLO (logit-transformed proportion), from cases/total


```{r, include=F}
# Sampling variance (vi).
# escalc() is called with measure = "PLO" -- in metafor this is the
# logit-transformed proportion (log odds) -- using cases and total. A variance
# already derived from the reported CI or SE (vi_CI) takes precedence; the
# escalc value is kept only where vi_CI is missing (529 observations
# according to the author).
df0 <- escalc(
  measure = "PLO",
  xi = cases,
  ni = total,
  # sei = SE is intentionally not passed
  data = df0,
  add = 1 / 2 # author's note: same values when add = 0
)

# Keep the escalc variance under its own name before overriding vi.
df0$vi_PLO <- df0$vi

has_ci_variance <- !is.na(df0$vi_CI)
df0$vi[has_ci_variance] <- df0$vi_CI[has_ci_variance]


```



# 1.6 - Calculate final 95% CI

```{r, include=F}

# Build the final 95% CI bounds on the proportion scale (inf_p / sup_p) and
# back-fill reported bounds (c95LIp / c95LSp) that are still missing.
# Two candidate intervals are computed on the logit scale and back-transformed:
#   * "*_plo": from the counts (cases / total);
#   * inf / sup: from the CI-based variance vi_CI.
# The statements further down apply a fixed precedence, so their order matters.

# Logit-scale variance from the counts: 1/cases + 1/(total - cases).
df0$var_log<-(1/df0$cases)+(1/(df0$total-df0$cases))
# Upper/lower logit limits from the counts. Where logitp is infinite, the
# escalc estimate yi and the current vi are used instead (vi holds vi_CI
# wherever that was available, see section 1.5).
df0$sup_plo<-df0$logitp+sqrt(df0$var_log)*qnorm(0.975)
df0$sup_plo[is.infinite(df0$logitp)]<-df0$yi[is.infinite(df0$logitp)]+sqrt(df0$vi[is.infinite(df0$logitp)])*qnorm(0.975)
df0$inf_plo<-df0$logitp-sqrt(df0$var_log)*qnorm(0.975)
df0$inf_plo[is.infinite(df0$logitp)]<-df0$yi[is.infinite(df0$logitp)]-sqrt(df0$vi[is.infinite(df0$logitp)])*qnorm(0.975)
# Inverse logit: back to the proportion scale.
df0$sup_p_plo<-exp(df0$sup_plo)/(1+exp(df0$sup_plo))
df0$inf_p_plo<-exp(df0$inf_plo)/(1+exp(df0$inf_plo))


# Flag rows that have no CI-based variance but do have the escalc variance.
df0$comment3[is.na(df0$vi_CI) & !is.na(df0$vi_PLO)]<-"vi calculated from PLO n/cases"


# Candidate limits from the CI-based variance, logit scale then proportion.
df0$sup<-df0$logitp+sqrt(df0$vi_CI)*qnorm(0.975)
df0$inf<-df0$logitp-sqrt(df0$vi_CI)*qnorm(0.975)
df0$sup_p<-exp(df0$sup)/(1+exp(df0$sup))
df0$inf_p<-exp(df0$inf)/(1+exp(df0$inf))

# Snapshot for manual inspection only; ts1 is overwritten at the end of the chunk.
ts1<-df0 %>%
  select(ID_AP, c95LIp_o, c95LIp,  inf_p )


# Reported bounds that are still missing take the count-based limits.
df0$c95LIp[is.na(df0$c95LIp)]<-df0$inf_p_plo[is.na(df0$c95LIp)]
df0$c95LSp[is.na(df0$c95LSp)]<-df0$sup_p_plo[is.na(df0$c95LSp)]


# Precedence 1: wherever a c95 bound exists it replaces the vi_CI-based limit.
df0$inf_p[!is.na(df0$c95LIp)]<-df0$c95LIp[!is.na(df0$c95LIp)]
df0$sup_p[!is.na(df0$c95LSp)]<-df0$c95LSp[!is.na(df0$c95LSp)]

# Precedence 2: remaining gaps take the count-based limits.
df0$inf_p[is.na(df0$inf_p)]<-df0$inf_p_plo[is.na(df0$inf_p)]
df0$sup_p[is.na(df0$sup_p)]<-df0$sup_p_plo[is.na(df0$sup_p)]

# Precedence 3: last fallback to the c95 bounds.
# NOTE(review): this looks like a no-op, since non-missing c95 bounds were
# already copied in step 1 -- confirm before removing.
df0$inf_p[is.na(df0$inf_p)]<-df0$c95LIp[is.na(df0$inf_p)]
df0$sup_p[is.na(df0$sup_p)]<-df0$c95LSp[is.na(df0$sup_p)]

# Rows flagged by the continuity correction always use the count-based limits.
df0$inf_p[!is.na(df0$comment_continuity)]<-df0$inf_p_plo[!is.na(df0$comment_continuity)]
df0$sup_p[!is.na(df0$comment_continuity)]<-df0$sup_p_plo[!is.na(df0$comment_continuity)]

# Final snapshot for manual inspection.
ts1<-df0 %>%
  select(ID_AP, c95LIp_o, c95LIp,  inf_p )
```

# 1.7 - Calculate final SE

```{r, include=F}
# Standard error back-calculated from the final 95% CI (proportion scale).
# A 95% CI spans 2 * qnorm(0.975) standard errors, so the width is divided by
# that product. The previous expression `width / qnorm(0.975) * 2` multiplied
# by 2 instead (operator precedence), which made SE_CI four times too large and
# disagreed with the var_CI formula in section 1.3.
# The bounds come from a back-transformed logit interval and may be
# asymmetric, so this SE is an approximation.
df0$SE_CI <- (df0$sup_p - df0$inf_p) / (2 * qnorm(0.975))

# Flag the rows whose missing SE is about to be filled from the CI.
df0$comment2[is.na(df0$SE) & !is.na(df0$SE_CI)] <- "SE calculated from CI"

# Snapshot for manual inspection (before filling).
ts1 <- df0 %>%
  select(ID_AP, SE, SE_CI)

# Only missing SEs are replaced; reported, non-zero SEs are kept.
df0$SE[is.na(df0$SE)] <- df0$SE_CI[is.na(df0$SE)]

# Snapshot for manual inspection (after filling).
ts1 <- df0 %>%
  select(
    ID_AP, Prevalence, P, c95LIp_o, c95LIp, inf_p_plo, inf_p,
    SE, SE_o, SE_CI, cases, cases_o, total, total_o, vi, vi_CI
  )
```


# 1.8 - Final list of comments

```{r, include=F}
# Join the five per-step flags into one space-separated string per row, then
# delete the "NA" placeholders that paste() writes for missing flags.
joined_flags <- paste(
  df0$comment,
  df0$comment2,
  df0$comment3,
  df0$comment_continuity,
  df0$comment_ci_identical
)
joined_flags <- gsub("NA ", "", joined_flags)
df0$comments <- gsub("NA", "", joined_flags)

# Rows carrying at least one flag, with the comments column first.
comments <- select(
  subset(df0, !is.na(comments) & comments != ""),
  comments, everything()
)

# Snapshot for manual inspection.
ts1 <- select(
  df0,
  ID_AP, Prevalence, P, c95LIp_o, c95LIp, SE, SE_o,
  cases, cases_o, total, total_o, vi, vi_CI, comments
)

```





# 1.9 - Labels for prevalence

```{r, include=F}


# Display label "P (lower-upper)", each value times 100 and rounded to two
# decimals.
p_pct <- round(df0$P * 100, 2)
lower_pct <- round(df0$inf_p * 100, 2)
upper_pct <- round(df0$sup_p * 100, 2)
df0$label <- paste0(p_pct, " (", lower_pct, "-", upper_pct, ")")

# Put the review columns first; all other columns keep their relative order.
df0 <- select(
  df0,
  comments, ID_AP, "label", P, SE_o, SE, SE_CI, c95LI, c95LS, inf_p, sup_p,
  everything(), "ID"
)

# Numeric copy of the row names.
df0$row_id <- as.numeric(row.names(df0))


```


# 1.10 - Save final database

```{r, include=F}
# Keep only rows with a known prevalence (the author notes that 2 observations
# are dropped) and write the result to disk.
df0s <- subset(df0, !is.na(P))

write.csv(df0s, file = "df0_SR.csv")

```



